######################################################################################################
## This program estimates coefficients and counterfactual functionals ##
## Model refers to "Distribution Regression with Selection"
## Authors: Chernozhukov, V., I. Fernandez-Val and S. Luo
## Last Update: 6/17/2018

## 80s vs 90s vs 100s within gender 

## male vs female within year(s)



rm(list=ls());

library(parallel);
library(sampleSelection);
library(maxLik);
library(mvtnorm);
library(abind);
library(MASS);

#####################
# Extract Arguments #
#####################
#Define default values for variables
counter.method <- 2; 
# 1 for counterfactual decades within gender; 
# 2 for counterfactual sex within time period; 
# 3 for counterfactual 18 years within gender;
rho.spec       <- 1; 
# Specification of rho: 1 for default; 
#                       2 for education-specific rho's; 
#                       3 for couple-specific rho's;
#                       4 for time-specific rho's;
#                       5 for linear expression of X;
#                       6 for age; 
#                       7 for number of kids;
#                       8 for linear expression of years of schooling;
#                       9 for 4 education dummies;
#                       10 for 4 education dummies x couple;
#                       11 for age x couple;
#                       12 for 4 age dummies;
#                       13 for numkids x couple;
#                       14 for 4 age dummies x couple;
#                       15 for linear in t;
#                       16 for quardratic in t;
#                       17 for yrseduc x couple;
#                       18 for linear in t x couple;
#                       19 for 6 time dummies;
#                       20 for propensity score of employment;
#                       21 for pscore x couple;
#                       22 for beninc0 + m_inc0;
sample.id      <- 11378;

#Get aruments from the command line
argv <- commandArgs(TRUE);

# Check if the command line is not empty and convert values to numerical values
if(length(argv) > 0){
  counter.method <- as.numeric(argv[1]);
  rho.spec       <- as.numeric(argv[2]);
  sample.id      <- as.numeric(argv[3]);
};

str.sample <- ifelse(counter.method==1, ifelse(sample.id==1, "_m","_f"), 
                     ifelse(counter.method==2, paste("_",sample.id,sep=""), ifelse(sample.id==1, "_mhalf","_fhalf")));
str.rho.all <- c("","_educ","_couple","_time","_allx","_age","_numkids","_yrseduc","_4dum","_4dumxcouple",
                 "_agexcouple","_4agedum","_numkidsxcouple","_4agedumxcouple","_lintime","_quartime","_yrseducxcouple",
                 "_lintimexcouple","_6tdum","_pscore","_pscorexcouple","_iv");
str.rho <- str.rho.all[rho.spec];

str.robust <- ""; # "_blundell3";

counter.id <- ifelse(counter.method==1, "d80100", ifelse(counter.method==2,"sex","half"));


#######################################
# Import data from a proper directory #
#######################################
if(str.robust == "_blundell3"){
  mydata <- read.table("drs_fesdata_b3.csv", header=TRUE, sep= ",");
}else{
  mydata <- read.table("drs_fesdata.csv", header=TRUE, sep= ",");
}
mydata$age2 <- mydata$age2/100;
mydata$age3 <- mydata$age3/1000;
mydata$age4 <- mydata$age4/10000;

mydata$agep  <- poly(mydata$age,degree=4)[,1]*100;
mydata$agep2 <- poly(mydata$age,degree=4)[,2]*100;
mydata$agep3 <- poly(mydata$age,degree=4)[,3]*100;
mydata$agep4 <- poly(mydata$age,degree=4)[,4]*100;
mydata$lw[mydata$work==0] <- NA;

mydata$d80100 <- 0;
mydata$d80100[mydata$decade == 80]  <- 1;
mydata$d80100[mydata$decade == 100] <- 2;

mydata$half <- 0;
mydata$half[mydata$year >= 96] <- 1;
mydata$half[mydata$year <= 95] <- 2;

mydata$six_d <- 1;
mydata$six_d[mydata$year>=84 & mydata$year<=89] <- 2; 
mydata$six_d[mydata$year>=90 & mydata$year<=95] <- 3; 
mydata$six_d[mydata$year>=96 & mydata$year<=101] <- 4; 
mydata$six_d[mydata$year>=102 & mydata$year<=107] <- 5; 
mydata$six_d[mydata$year>=108 & mydata$year<=113] <- 6; 

mydata$age_d <- 1;
mydata$age_d[mydata$age>=30 & mydata$age<=39] <- 2;
mydata$age_d[mydata$age>=40 & mydata$age<=49] <- 3;
mydata$age_d[mydata$age>=50 & mydata$age<=59] <- 4;

mydata$year_res <- mydata$year - 77;

# mydata <- mydata[mydata$educ23==0 & mydata$educ2122==0 & mydata$educ1920==0,];
if(str.robust == "_blundell"){
  mydata <- mydata[mydata$indbeninc0 > 0,];
  mydata$indbeninc0 <- log(mydata$indbeninc0);
  mydata$indm_inc0 <- mydata$indbeninc0*mydata$couple;
}else if(str.robust == "_blundell2" | str.robust == "_blundell3"){
  mydata <- mydata[mydata$bbeninc0 > 0,];
  mydata$bbeninc0 <- log(mydata$bbeninc0);
  mydata$bm_inc0 <- mydata$bbeninc0*mydata$couple;
}

if(counter.method == 1){
  outcome.formula <- lw ~ educ16 + educ1718 + educ1920 + educ2122 + educ23 + couple + agep + agep2 + agep3 + agep4 + 
                          factor(region) + numch1 + numch2 + numch34 + numch510 + numch1116 + numch1718 + factor(t_d);
}else if(counter.method == 3){
  outcome.formula <- lw ~ educ16 + educ1718 + educ1920 + educ2122 + educ23 + couple + agep + agep2 + agep3 + agep4 + 
                     factor(region) + numch1 + numch2 + numch34 + numch510 + numch1116 + numch1718 + factor(half_d);
}else if(sample.id>=1000){
  outcome.formula <- lw ~ educ16 + educ1718 + educ1920 + educ2122 + educ23 + couple + agep + agep2 + agep3 + agep4 + 
                     factor(region) + numch1 + numch2 + numch34 + numch510 + numch1116 + numch1718 + factor(year);
}else{ # where sample is a single year;
  outcome.formula <- lw ~ educ16 + educ1718 + educ1920 + educ2122 + educ23 + couple + agep + agep2 + agep3 + agep4 + 
                     factor(region) + numch1 + numch2 + numch34 + numch510 + numch1116 + numch1718;
};

if(str.robust == ""){
  selection.formula <- update(outcome.formula, work ~ . + tubeninc0 + m_inc0);
}else if(str.robust == "_blundell"){
  selection.formula <- update(outcome.formula, work ~ . + indbeninc0 + indm_inc0);
}else if(str.robust == "_blundell2" | str.robust == "_blundell3"){
  selection.formula <- update(outcome.formula, work ~ . + bbeninc0 + bm_inc0);
}

if(rho.spec==1){
  rho.formula <- work ~ 1; # dependent variable does not mattrer;
}else if(rho.spec==2){
  rho.formula <- work ~ 0 + educ15 + educ16 + educ1718 + educ1920 + educ2122 + educ23;
}else if(rho.spec==3){
  rho.formula <- work ~ 0 + single + couple;
}else if(rho.spec==4){
  if(counter.method == 1){
    rho.formula <- work ~ 0 + factor(t_d);
  }else if(counter.method == 3){
    rho.formula <- work ~ 0 + factor(half_d);
  }else if(sample.id >= 1000){
    rho.formula <- work ~ 0 + factor(year);
  }else{
    rho.formula <- work ~ 1;
  };
}else if(rho.spec==5){
  rho.formula <- update(outcome.formula, work~.);
}else if(rho.spec==6){
  rho.formula <- work ~ age;
}else if(rho.spec==7){
  rho.formula <- work ~ numch1 + numch2 + numch34 + numch510 + numch1116 + numch1718;
}else if(rho.spec==8){
  rho.formula <- work ~ yrseduc;
}else if(rho.spec==9){
  rho.formula <- work ~ 0 + hsdrop + hs + somecol + col;
}else if(rho.spec==10){
  rho.formula <- work ~ 0 + (hsdrop + hs + somecol + col):single + (hsdrop + hs + somecol + col):couple;
}else if(rho.spec==11){
  rho.formula <- work ~ couple + age + age:couple;
}else if(rho.spec==12){
  rho.formula <- work ~ 0 + factor(age_d);
}else if(rho.spec==13){
  rho.formula <- work ~ couple + numchyt6 + numchyt6:couple;
}else if(rho.spec==14){
  rho.formula <- work ~ 0 + factor(age_d):single + factor(age_d):couple;
}else if(rho.spec==15){
  rho.formula <- work ~ year_res;
}else if(rho.spec==16){
  rho.formula <- work ~ year_res + I(year_res^2);
}else if(rho.spec==17){
  rho.formula <- work ~ yrseduc*couple;
}else if(rho.spec==18){
  rho.formula <- work ~ year_res*couple;
}else if(rho.spec==19){
  rho.formula <- work ~ 0 + factor(six_d);
}else if(rho.spec==20){
  rho.formula <- work ~ pscore;
}else if(rho.spec==21){
  rho.formula <- work ~ pscore*couple;
}else if(rho.spec==22){
  rho.formula <- work ~ tubeninc0 + m_inc0;
};


Y.name <- all.vars(outcome.formula)[1];


####################
# Parallel Setting #
####################

# Calculate the namber of cores
n.cores <- ceiling(as.numeric(Sys.getenv("NSLOTS"))/1);
if(is.na(n.cores))n.cores <- 1;

################################################
# Source the file which contains the functions #
################################################
source("DRS_Source.R");


## Counterfactual Distributions for grouped/individual years ##


# Keep data for one gender for a period (from.year ~ to.year) #
if(counter.method == 1){
  mysample <- mydata[is.element(mydata$year,c(80:89,100:109)) & mydata$sex == sample.id,];
}else if(counter.method == 3){
  mysample <- mydata[mydata$sex == sample.id,];
}else{
  if(sample.id >= 1000){
    if(sample.id >= 100000){
      from.year <- sample.id %/% 1000;
      to.year   <- sample.id - from.year*1000;
    }else{
      from.year <- sample.id %/% 100;
      to.year   <- sample.id - from.year*100;
    };
  }else{
    from.year <- sample.id;
    to.year   <- sample.id;
  }
  mysample <- mydata[is.element(mydata$year,c(from.year:to.year)),];
};
  
####################################
# Set up Threshold being estimated #
####################################
u.thres <- 0.99;
l.thres <- 0.01;
by.thres <- 0.01; 
ys <- quantile(mysample[,Y.name],seq(l.thres,u.thres,by.thres), na.rm=TRUE);
n.thres <- length(ys);




#############################
# Estimate parameters and F #
#############################
elist.counter1 <- Estimation(mysample[mysample[,counter.id]==1,], ys, 
                             outcome.formula, selection.formula, rho.formula, initial.val=numeric(0));
elist.counter2 <- Estimation(mysample[mysample[,counter.id]==2,], ys, 
                             outcome.formula, selection.formula, rho.formula, initial.val=numeric(0));

Zwc.all <- model.matrix(selection.formula, mysample);
Z.name  <- colnames(Zwc.all);

mysample$pscore <- 0;
mysample$pscore[mysample[,counter.id]==1] <- pnorm(Zwc.all[mysample[,counter.id]==1,]%*%elist.counter1$pi[1,]);
mysample$pscore[mysample[,counter.id]==2] <- pnorm(Zwc.all[mysample[,counter.id]==2,]%*%elist.counter2$pi[1,]);

Heckman.counter1 <- selection(selection=selection.formula, outcome=outcome.formula, method="2-step",
                                    data=mysample[mysample[,counter.id]==1,]); 
Heckman.counter2 <- selection(selection=selection.formula, outcome=outcome.formula, method="2-step",
                                    data=mysample[mysample[,counter.id]==2,]);
pi.Heckman.counter1 <- coef(Heckman.counter1)[1:length(Z.name)];
pi.Heckman.counter2 <- coef(Heckman.counter2)[1:length(Z.name)];
beta.Heckman.counter1 <- coef(Heckman.counter1, part="outcome");
beta.Heckman.counter2 <- coef(Heckman.counter2, part="outcome");
rho.Heckman.counter1 <- coef(Heckman.counter1)["rho"];
rho.Heckman.counter2 <- coef(Heckman.counter2)["rho"];
sigma.Heckman.counter1 <- coef(Heckman.counter1)["sigma"];
sigma.Heckman.counter2 <- coef(Heckman.counter2)["sigma"];


###########################
# Save estimation results #
###########################
save.image(paste("Est",str.sample,str.rho,str.robust,".RData",sep=""));

